* Number the observations inside each game. seq() combined with by() restarts
* the counter at 1 for every season / matchday / home team / away team group,
* so game_id is a within-game row index (1, 2, 3, ...), not a value that
* differs from one game to the next. game_id==1 therefore marks exactly one
* row per game.
* NOTE(review): the sort is presumably meant to make the numbering follow
* matchobs and minute within a game - confirm if the within-game order matters.
sort season matchday hteam_id ateam_id matchobs minute
egen game_id=seq(), by(season matchday hteam_id ateam_id)

*** travel distance - fill in the few observations whose distance is missing
* Build an order-free key for each pairing: team1 holds the smaller id and
* team2 the larger, so "A at B" and "B at A" share one key. Both stay missing
* when the two ids are equal.
gen team1=cond(hteam_id<ateam_id, hteam_id, ateam_id) if hteam_id!=ateam_id
gen team2=cond(hteam_id<ateam_id, ateam_id, hteam_id) if hteam_id!=ateam_id
* largest non-missing distance recorded for the pairing, in either direction
egen dist_places=max(distance), by(team1 team2)
* only 4 observations have a missing distance; borrow the pairing's value
replace distance=dist_places if distance>=.
* squared term for the distance effect
gen dist2=distance^2

* Where the matches-played counter is missing, fall back to the matchday number
* (h = home side, a = away side).
foreach side in h a {
	replace `side'_matches=matchday if `side'_matches==.
}

*** variables for "game attractiveness": per-game averages of goals scored,
*** goals conceded and wins for each team
* Creates, in this order: h_goals_per_game h_goals_against_per_game
* h_wins_per_game a_goals_per_game a_goals_against_per_game a_wins_per_game
foreach side in h a {
	foreach stat in goals goals_against wins {
		gen `side'_`stat'_per_game=`side'_`stat'/`side'_matches
	}
}

* Tobit model of guest_fans, left-censored at 0, estimated without output.
* The six *_per_game regressors are listed by name rather than as a varlist
* range, so the specification does not depend on variable order in the dataset.
quietly tobit guest_fans i.division i.season distance dist2 ///
	i.hteam_id i.ateam_id i.day_week ///
	h_goals_per_game h_goals_against_per_game h_wins_per_game ///
	a_goals_per_game a_goals_against_per_game a_wins_per_game ///
	h_standing a_standing, ll(0)
* Expected value of the outcome censored below at 0, computed only on the
* game_id==1 row of each game (all other rows are left missing).
predict predicted_guest_fans if game_id==1, ystar(0,.)
correlate guest_fans predicted_guest_fans

* Copy that single per-game prediction onto every row of the same game.
egen predict_guest_fans=max(predicted_guest_fans), by(season matchday hteam_id ateam_id)
